Merge pull request #448 from knu/website_agent-text_parser

Add a parser type `text` to WebsiteAgent.

Akinori MUSHA 10 лет назад
Родитель
Коммит
774bed2390
2 изменённых файла: 91 добавлено и 4 удалено
  1. 39 4
      app/models/agents/website_agent.rb
  2. 52 0
      spec/models/agents/website_agent_spec.rb

+ 39 - 4
app/models/agents/website_agent.rb

@@ -19,7 +19,7 @@ module Agents
19 19
 
20 20
       `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape)
21 21
 
22
-      The `type` value can be `xml`, `html`, or `json`.
22
+      The `type` value can be `xml`, `html`, `json`, or `text`.
23 23
 
24 24
       To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
25 25
 
@@ -40,6 +40,28 @@ module Agents
40 40
             "description": { "path": "results.data[*].description" }
41 41
           }
42 42
 
43
+      When parsing text, each sub-hash should contain a `regexp` and `index`.  Output text is matched against the regular expression repeatedly from the beginning through to the end, collecting a captured group specified by `index` in each match.  Each index should be either an integer or a string name which corresponds to `(?<_name_>...)`.  For example, to parse lines of `_word_: _definition_`, the following should work:
44
+
45
+          "extract": {
46
+            "word": { "regexp": "^(.+?): (.+)$", index: 1 },
47
+            "definition": { "regexp": "^(.+?): (.+)$", index: 2 },
48
+          }
49
+
50
+      Or if you prefer names to numbers for index:
51
+
52
+          "extract": {
53
+            "word": { "regexp": "^(?<word>.+?): (?<definition>.+)$", index: 'word' },
54
+            "definition": { "regexp": "^(?<word>.+?): (?<definition>.+)$", index: 'definition' },
55
+          }
56
+
57
+      To extract the whole content as one event:
58
+
59
+          "extract": {
60
+            "content": { "regexp": "\A(?m:.)*\z", index: 0 },
61
+          }
62
+
63
+      Beware that `.` does not match the newline character (LF) unless the `m` flag is in effect, and `^`/`$` basically match every line beginning/end.  See [this document](http://ruby-doc.org/core-#{RUBY_VERSION}/doc/regexp_rdoc.html) to learn the regular expression variant used in this service.
64
+
43 65
       Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor.  E.g., if you're extracting rows, all extractors must match all rows.  For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful.
44 66
 
45 67
       Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `"username:password"`, or `["username", "password"]`.
@@ -140,7 +162,15 @@ module Agents
140 162
           else
141 163
             output = {}
142 164
             interpolated['extract'].each do |name, extraction_details|
143
-              if extraction_type == "json"
165
+              case extraction_type
166
+              when "text"
167
+                regexp = Regexp.new(extraction_details['regexp'])
168
+                result = []
169
+                doc.scan(regexp) {
170
+                  result << Regexp.last_match[extraction_details['index']]
171
+                }
172
+                log "Extracting #{extraction_type} at #{regexp}: #{result}"
173
+              when "json"
144 174
                 result = Utils.values_at(doc, extraction_details['path'])
145 175
                 log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
146 176
               else
@@ -253,10 +283,13 @@ module Agents
253 283
 
254 284
     def extraction_type
255 285
       (interpolated['type'] || begin
256
-        if interpolated['url'] =~ /\.(rss|xml)$/i
286
+        case interpolated['url']
287
+        when /\.(rss|xml)$/i
257 288
           "xml"
258
-        elsif interpolated['url'] =~ /\.json$/i
289
+        when /\.json$/i
259 290
           "json"
291
+        when /\.(txt|text)$/i
292
+          "text"
260 293
         else
261 294
           "html"
262 295
         end
@@ -271,6 +304,8 @@ module Agents
271 304
           JSON.parse(data)
272 305
         when "html"
273 306
           Nokogiri::HTML(data)
307
+        when "text"
308
+          data
274 309
         else
275 310
           raise "Unknown extraction type #{extraction_type}"
276 311
       end

+ 52 - 0
spec/models/agents/website_agent_spec.rb

@@ -398,6 +398,58 @@ describe Agents::WebsiteAgent do
398 398
           event.payload['response']['title'].should == "hello!"
399 399
         end
400 400
       end
401
+
402
+      describe "text parsing" do
403
+        before do
404
+          stub_request(:any, /text-site/).to_return(body: <<-EOF, status: 200)
405
+water: wet
406
+fire: hot
407
+          EOF
408
+          site = {
409
+            'name' => 'Some Text Response',
410
+            'expected_update_period_in_days' => '2',
411
+            'type' => 'text',
412
+            'url' => 'http://text-site.com',
413
+            'mode' => 'on_change',
414
+            'extract' => {
415
+              'word' => { 'regexp' => '^(.+?): (.+)$', index: 1 },
416
+              'property' => { 'regexp' => '^(.+?): (.+)$', index: 2 },
417
+            }
418
+          }
419
+          @checker = Agents::WebsiteAgent.new(name: 'Text Site', options: site)
420
+          @checker.user = users(:bob)
421
+          @checker.save!
422
+        end
423
+
424
+        it "works with regexp" do
425
+          @checker.options = @checker.options.merge('extract' => {
426
+            'word' => { 'regexp' => '^(?<word>.+?): (?<property>.+)$', index: 'word' },
427
+            'property' => { 'regexp' => '^(?<word>.+?): (?<property>.+)$', index: 'property' },
428
+          })
429
+
430
+          lambda {
431
+            @checker.check
432
+          }.should change { Event.count }.by(2)
433
+
434
+          event1, event2 = Event.last(2)
435
+          event1.payload['word'].should == 'water'
436
+          event1.payload['property'].should == 'wet'
437
+          event2.payload['word'].should == 'fire'
438
+          event2.payload['property'].should == 'hot'
439
+        end
440
+
441
+        it "works with regexp with named capture" do
442
+          lambda {
443
+            @checker.check
444
+          }.should change { Event.count }.by(2)
445
+
446
+          event1, event2 = Event.last(2)
447
+          event1.payload['word'].should == 'water'
448
+          event1.payload['property'].should == 'wet'
449
+          event2.payload['word'].should == 'fire'
450
+          event2.payload['property'].should == 'hot'
451
+        end
452
+      end
401 453
     end
402 454
 
403 455
     describe "#receive" do